
//-----------------------------------------------------------------------
//
// Gentrification and pioneer businesses
// Kristian Behrens, Brahim Boualam, Julien Martin, and Florian Mayneris
//
// First version: 17/02/2015
// This  version: 10/11/2021
//
// Preparation of the NHGIS census variables for New York
//
//-----------------------------------------------------------------------


// Do-file for concording geographical data, and for apportioning data
// using Census Blocks (either with population weights or proportional to the
// surface of the blocks)


//THIS IS FOR THE RACIAL VARIABLES, BROKEN DOWN FROM BLOCK GROUP TO BLOCK
//WE NEED TO CHANGE THE 2000/2010 PROCEDURE, SINCE BLOCK IS DIRECTLY AVAILABLE


// Project root: every relative path used below is resolved against it
cd "/Users/kristianbehrens/Desktop/GENTRI_RESTAT_FINAL/"

// Start from an empty session and suppress the -more- pauses
clear
clear matrix
set more off

// Counties of the NY MSA (2010 definition). Each entry is compared below
// against the first five characters of the geoid of a block or block group.
local listcountyNYMSA ///
	34003 36005 36027 34013 34017 34019 36047 34023 34025 34027 ///
	36059 36061 34029 36071 34031 42103 36079 36081 36085 36087 ///
	34035 36103 34037 34039 36119

// NOTE(review): listvar and years are not referenced anywhere in the visible
// part of this do-file; they are kept in case later code relies on them.
local listvar 001 002 003 004 005
local years 90 00 10



// --------------------------------------------------------
// BEGIN: MATCHING OF NHGIS 1990 BLOCKGROUP DATA

// STEP 1


// Load the 1990 NHGIS block-group race counts
insheet using "data/census/nhgis_race_bgroup/nhgis0005_ds120_1990_blck_grp.csv", delimit(",")

// Build geoid90 from gisjoin. After the leading character, the identifier is
// made of chars 1-2, chars 4-6, a middle part of 4 or 6 chars starting at
// char 8, and one final char (chars 3 and 7 are skipped). gisjoin is 13 chars
// long with the 4-char middle part and 15 chars long with the 6-char one.
//
// IMPORTANT: the length test must be an -if- QUALIFIER on each -replace-.
// An -if (exp) { }- COMMAND evaluates exp once, on the first observation only,
// so every row would be parsed with the layout of the first row and rows with
// the other gisjoin length would get a wrong geoid90.
gen test = substr(gisjoin, 2, 100)
gen geoid90 = ""

replace geoid90 = substr(test, 1, 2) + substr(test, 4, 3) + substr(test, 8, 4) + substr(test, 12, 1) if strlen(gisjoin) == 13
replace geoid90 = substr(test, 1, 2) + substr(test, 4, 3) + substr(test, 8, 6) + substr(test, 14, 1) if strlen(gisjoin) == 15

// Rows with any other gisjoin length keep geoid90 == "" (as before)
drop test

// Remove observations whose geoid90 repeats the previous one
sort geoid90
drop if geoid90[_n-1] == geoid90[_n]

// Race counts: three groups map one-to-one, "other" pools euy003 and euy005
keep geoid90 euy001 euy002 euy003 euy004 euy005
ren euy001 pop_white90
gen pop_other90 = euy003 + euy005
ren euy002 pop_black90
ren euy004 pop_asian90

drop euy003 euy005

// Keep only NYMSA counties (first five characters of geoid90)
gen ctid = substr(geoid90, 1, 5)
gen temp = 0

foreach i in `listcountyNYMSA' {
	replace temp = 1 if ctid == "`i'"
}

keep if temp == 1

order geoid90 pop_white90 pop_black90 pop_asian90 pop_other90
drop ctid temp

// Scratch file, merged into the 1990 block geography in STEP 2
save "temp/temp_variables.dta", replace


// STEP 2

// Merge with the 1990 geography files
use "data/geography/geog_new_york/geography1990.dta", clear

// Keep only blocks whose first five geoid90 characters are in the NY MSA list
gen byte in_msa = 0
foreach fips of local listcountyNYMSA {
	replace in_msa = 1 if substr(geoid90, 1, 5) == "`fips'"
}
keep if in_msa == 1
drop in_msa

// The merge key must be called geoid90 (as in temp_variables.dta), so park the
// block identifier and derive the block-group identifier from it: strip the
// last 2 characters when there is no block suffix, the last 3 when there is one.
ren geoid90 geoid90_back
gen geoid90 = substr(geoid90_back, 1, strlen(geoid90_back) - cond(blocksuffix90 == "", 2, 3))

sort geoid90

// Many blocks per block group; block groups without any block are discarded
merge m:1 geoid90 using "temp/temp_variables.dta"

tab _merge
drop if _merge == 2
drop _merge

// Restore the naming: geoid90 = block, geoid90_blgroup = block group
ren geoid90 geoid90_blgroup
ren geoid90_back geoid90

// Block-group totals of block population and land area
// (blgroup_surface is not used further in this section)
egen pop_all90 = total(pop90), by(geoid90_blgroup)
egen blgroup_surface = total(aland90), by(geoid90_blgroup)


// STEP 3: BREAKING THE DATA DOWN BY POPULATION SHARE

// Share of the block in its block group's population; 0 when the block
// group total is zero
gen pop_share = cond(pop_all90 != 0, pop90/pop_all90, 0)

// Apportion each block-group race count to the block
foreach g in white black asian other {
	gen pop_`g'_block = pop_`g'90*pop_share
}

gen year = 1990
ren geoid90 geoid

keep geoid year pop_white_block pop_black_block pop_asian_block pop_other_block
order geoid year pop_white_block pop_black_block pop_asian_block pop_other_block

save "temp/race1990.dta", replace


// END: MATCHING OF NHGIS 1990 BLOCKGROUP DATA
// --------------------------------------------------------


clear


// --------------------------------------------------------
// BEGIN: MATCHING OF NHGIS 2000 BLOCK DATA


// STEP 1: GET THE BLOCK DATA

insheet using "data/census/nhgis_race_block/nhgis0008_ds147_2000_block.csv", delimit(",")

// Build geoid00 from gisjoin: drop the leading character, then concatenate
// chars 1-2, chars 4-6 and up to ten chars starting at char 8
// (chars 3 and 7 are skipped)
gen test = substr(gisjoin, 2, 100)
gen geoid00 = substr(test, 1, 2) + substr(test, 4, 3) + substr(test, 8, 10)
drop test

// Race counts: three groups map one-to-one, "other" pools fxw003, fxw005
// and fxw006
keep geoid00 fxw001 fxw002 fxw003 fxw004 fxw005 fxw006
ren fxw001 pop_white00_block
ren fxw002 pop_black00_block
ren fxw004 pop_asian00_block
gen pop_other00_block = fxw003 + fxw005 + fxw006

drop fxw003 fxw005 fxw006


// Keep only NYMSA counties (first five characters of geoid00)
gen ctid = substr(geoid00, 1, 5)
gen temp = 0

foreach i in `listcountyNYMSA' {
	replace temp = 1 if ctid == "`i'"
}

keep if temp == 1

// Full variable names: the abbreviated forms (pop_white00, ...) only resolve
// through variable abbreviation and break under -set varabbrev off-
order geoid00 pop_white00_block pop_black00_block pop_asian00_block pop_other00_block
drop ctid temp

// Scratch file, merged into the 2000 block geography in STEP 2
save "temp/temp_variables.dta", replace


// STEP 2: MERGE WITH THE BLOCK GEOGRAPHY FILES

use "data/geography/geog_new_york/geography2000.dta", clear

// Keep only blocks whose first five geoid00 characters are in the NY MSA list
gen byte in_msa = 0
foreach fips of local listcountyNYMSA {
	replace in_msa = 1 if substr(geoid00, 1, 5) == "`fips'"
}
keep if in_msa == 1
drop in_msa

// Merge key: the first 15 characters of the geography identifier
ren geoid00 geoid00_full
gen geoid00 = substr(geoid00_full, 1, 15)

sort geoid00

// One-to-one match with the block-level race counts; unmatched rows from
// either side are kept (the tabulation is for inspection only)
merge 1:1 geoid00 using "temp/temp_variables.dta"

tab _merge
drop _merge

gen year = 2000

// Harmonise names across census years
ren geoid00 geoid
foreach g in white black asian other {
	ren pop_`g'00_block pop_`g'_block
}

keep geoid year pop_white_block pop_black_block pop_asian_block pop_other_block
order geoid year pop_white_block pop_black_block pop_asian_block pop_other_block

save "temp/race2000.dta", replace


// END: MATCHING OF NHGIS 2000 BLOCK DATA
// --------------------------------------------------------


clear



// --------------------------------------------------------
// BEGIN: MATCHING OF NHGIS 2010 BLOCK DATA

// STEP 1: GET THE BLOCK DATA

insheet using "data/census/nhgis_race_block/nhgis0010_ds172_2010_block.csv", delimit(",")

// geoid10 = gisjoin without its leading character and without the characters
// at positions 4 and 8 of gisjoin (same offsets as substr(gisjoin, 2, 100)
// followed by pieces 1-2, 4-6 and 8-17)
gen geoid10 = substr(gisjoin, 2, 2) + substr(gisjoin, 5, 3) + substr(gisjoin, 9, 10)

// Race counts: three groups map one-to-one, "other" pools h7x004, h7x006
// and h7x007
keep geoid10 h7x002 h7x003 h7x004 h7x005 h7x006 h7x007
gen pop_other10_block = h7x004 + h7x006 + h7x007
ren h7x002 pop_white10_block
ren h7x003 pop_black10_block
ren h7x005 pop_asian10_block
drop h7x004 h7x006 h7x007

// Keep only NYMSA counties (first five characters of geoid10)
gen byte in_msa = 0
foreach fips of local listcountyNYMSA {
	replace in_msa = 1 if substr(geoid10, 1, 5) == "`fips'"
}
keep if in_msa == 1
drop in_msa

order geoid10 pop_white10_block pop_black10_block pop_asian10_block pop_other10_block

// Scratch file, merged into the 2010 block geography in STEP 2
save "temp/temp_variables.dta", replace


// STEP 2

// Merge with the 2010 geography files
use "data/geography/geog_new_york/geography2010.dta", clear

// Keep only blocks whose first five geoid10 characters are in the NY MSA list
gen byte in_msa = 0
foreach fips of local listcountyNYMSA {
	replace in_msa = 1 if substr(geoid10, 1, 5) == "`fips'"
}
keep if in_msa == 1
drop in_msa

// Merge key: the first 15 characters of the geography identifier
ren geoid10 geoid10_full
gen geoid10 = substr(geoid10_full, 1, 15)

sort geoid10

// Several geography rows may share one key; unmatched rows from either side
// are kept (the tabulation is for inspection only)
merge m:1 geoid10 using "temp/temp_variables.dta"

tab _merge
drop _merge

gen year = 2010

// Harmonise names across census years
ren geoid10 geoid
foreach g in white black asian other {
	ren pop_`g'10_block pop_`g'_block
}

keep geoid year pop_white_block pop_black_block pop_asian_block pop_other_block
order geoid year pop_white_block pop_black_block pop_asian_block pop_other_block

save "temp/race2010.dta", replace


// END: MATCHING OF NHGIS 2010 BLOCK DATA
// --------------------------------------------------------



// Assembling the files


// Stack the three census years. The 2010 file is loaded explicitly so that
// this step does not depend on whatever happens to be in memory.
use "temp/race2010.dta", clear
append using "temp/race2000.dta" "temp/race1990.dta"

sort year geoid

save "results/NewYork/census_variables_race.dta", replace


// Add the data from the original 1990 census STF files, which report
// block-level racial composition


// Adjust the geoid codes for 1990
// The 1990 block data comes from the NHGIS, with NHGIS identifiers
// The concordance data comes from the Census with the Census identifiers

// We need to transform the NHGIS identifiers (from the geography1990.dta
// files) to Census identifiers: identifiers of length 12 or 13 get "00"
// inserted after their ninth character. Only 1990 rows are recoded, as
// intended; the 2000/2010 identifiers are left untouched.

replace geoid = substr(geoid, 1, 9) + "00" + substr(geoid, 10, .) if year == 1990 & inlist(strlen(geoid), 12, 13)

// Path is relative to the project root set by -cd- at the top of the file,
// like every other path in this do-file
merge 1:1 geoid year using "data/census/racial_pop1990_block.dta"

drop _merge

// Use the directly reported block counts wherever they exist and keep the
// values computed above everywhere else. Dropping the computed variables
// outright would blank out every row (in particular every year) that is not
// covered by the direct file.
// NOTE(review): assumes racial_pop1990_block.dta only covers 1990 -- confirm.
foreach g in white black asian {
	replace pop_`g'_block = pop_`g'_block_direct if !missing(pop_`g'_block_direct)
	drop pop_`g'_block_direct
}

order geoid year pop_white_block pop_black_block pop_asian_block pop_other_block

save "results/NewYork/census_variables_race.dta", replace

